## estimates the difference between ATE for women and ATE for men and
## calculates credible intervals. Outcome is application. All analysese
## are Bayesian analyses. Estimates are all intent-to-treat
## estimates.
##
## table of results has the following columns (in order):
##
## subgroup    estimated difference    95% CI    
##
##
## Cait Unkovic, Maya Sen, Kevin Quinn
##
## 12/15/2014
##

set.seed(20384)


## utility functions

## tabulate data (used to format data for robins.ci() )
tabulate.data <- function(data, outcome="applied"){
  sex.vals <- unique(data$sex)
  univ.vals <- unique(data$univ)

  univ <- rep(NA, length(univ.vals)*2)
  sex <- rep(NA, length(univ.vals)*2)
  X0Y0 <- rep(NA, length(univ.vals)*2)
  X0Y1 <- rep(NA, length(univ.vals)*2)
  X1Y0 <- rep(NA, length(univ.vals)*2)
  X1Y1 <- rep(NA, length(univ.vals)*2)
  n.treated <- rep(NA, length(univ.vals)*2)
  n.control <- rep(NA, length(univ.vals)*2)
  n.units <- rep(NA, length(univ.vals)*2)
  
  counter <- 1
  for (u in univ.vals){
    for (s in sex.vals){
      data.sub <- data[data$univ == u & data$sex == s,]
      treated.data <- na.omit(data.sub[data.sub$treat==1,outcome])
      control.data <- na.omit(data.sub[data.sub$treat==0,outcome])

      n.treated[counter] <- length(treated.data)
      n.control[counter] <- length(control.data)            
      n.units[counter] <- n.treated[counter] + n.control[counter]
      univ[counter] <- u
      sex[counter] <- s
      X0Y0[counter] <- sum(control.data==0)
      X0Y1[counter] <- sum(control.data==1)
      X1Y0[counter] <- sum(treated.data==0)
      X1Y1[counter] <- sum(treated.data==1)
      
      counter <- counter + 1
      }
  }
  
  return(data.frame(univ, sex, n.units, n.treated, n.control, X0Y0, X0Y1,
                    X1Y0, X1Y1, stringsAsFactors=FALSE))

}






## calculates Bayesian posterior mean and 95% HPD region for
## ATE_women - ATE_men
## data.tab should be constructed by tabulate.data()
##
## stratified estimator
bayes.sex.diff <- function(data.tab, subset=NULL, M=100000){
  library(coda)
  if (is.null(subset)){
    subset=rep(TRUE, nrow(data.tab))
  }
  ## error checking for subset
  if (length(subset) != nrow(data.tab)){
    stop("length of subset not equal to number of university-sex combinations")
  }
  if (!is.logical(subset)){
    stop("subset is not a logical vector")
  }
  data.tab <- data.tab[subset,]

  
  data.tab.male <- data.tab[data.tab$sex=="Male",]
  data.tab.female <- data.tab[data.tab$sex=="Female",]
  total.male <- sum(data.tab.male$n.units)
  total.female <- sum(data.tab.female$n.units)
  
  SATE.male <- rep(0, M)
  SATE.female <- rep(0, M)
  SATE.dif <- rep(0, M)

  counter <- 1
  for (u in unique(data.tab$univ)){
    data.tab.sub <- data.tab[data.tab$univ==u,]
    data.tab.sub.male <- data.tab.sub[data.tab.sub$sex=="Male",]
    data.tab.sub.female <- data.tab.sub[data.tab.sub$sex=="Female",]
   
    ## male effects
    theta.treated <- rbeta(M, data.tab.sub.male$X1Y1+0.1,
                           data.tab.sub.male$X1Y0+0.1)
    theta.control <- rbeta(M, data.tab.sub.male$X0Y1+0.1,
                           data.tab.sub.male$X0Y0+0.1)

    ## sum of Y(0) among the control units
    ## data.tab.sub.male$X0Y1
    
    ## sum of Y(1) among the treated units
    ## data.tab.sub.male$X1Y1
    
    ## sum of (simulated) Y(0) among the treated units
    Y0.sim <- rbinom(M, size=data.tab.sub.male$n.treated, prob=theta.control)
    
    ## sum of (simulated) Y(1) among the control units
    Y1.sim <- rbinom(M, size=data.tab.sub.male$n.control, prob=theta.treated)

    CATE.male <- (data.tab.sub.male$X1Y1 + Y1.sim) / data.tab.sub.male$n.units -
      (data.tab.sub.male$X0Y1 + Y0.sim) / data.tab.sub.male$n.units
    
    SATE.male <- SATE.male + CATE.male * (data.tab.sub.male$n.units / total.male)

    
    
    ## female effects
    theta.treated <- rbeta(M, data.tab.sub.female$X1Y1+0.1,
                           data.tab.sub.female$X1Y0+0.1)
    theta.control <- rbeta(M, data.tab.sub.female$X0Y1+0.1,
                           data.tab.sub.female$X0Y0+0.1)

    ## sum of Y(0) among the control units
    ## data.tab.sub.female$X0Y1
    
    ## sum of Y(1) among the treated units
    ## data.tab.sub.female$X1Y1
    
    ## sum of (simulated) Y(0) among the treated units
    Y0.sim <- rbinom(M, size=data.tab.sub.female$n.treated, prob=theta.control)
    
    ## sum of (simulated) Y(1) among the control units
    Y1.sim <- rbinom(M, size=data.tab.sub.female$n.control, prob=theta.treated)

    CATE.female <- (data.tab.sub.female$X1Y1 + Y1.sim) / data.tab.sub.female$n.units - (data.tab.sub.female$X0Y1 + Y0.sim) / data.tab.sub.female$n.units

    SATE.female <- SATE.female +
      CATE.female * (data.tab.sub.female$n.units / total.female)
       
    counter <- counter + 1
  }

  SATE.dif <- SATE.female - SATE.male
  

  SATE.male.mean <- mean(SATE.male)
  SATE.female.mean <- mean(SATE.female)
  SATE.dif.mean <- mean(SATE.dif)
  SATE.dif.CI <- HPDinterval(as.mcmc(SATE.dif), prob=0.95)
  SATE.dif.probGT0 <- mean(SATE.dif > 0)
  
  output <- c(SATE.male.mean, SATE.female.mean, SATE.dif.mean,
              SATE.dif.CI[,"lower"], SATE.dif.CI[,"upper"], SATE.dif.probGT0)
  names(output) <- c("SATE.male.mean", "SATE.female.mean", "SATE.dif.mean",
                     "SATE.dif.lower", "SATE.dif.upper", "Prob SATE > 0")
  
  return(output)
}




















###################################################################
## read data in and do some processing of the raw data
x <- read.csv("gradcontacts_treatedcontrol_import.csv",
              stringsAsFactors=FALSE)

names(x) <- c("univ", "sex", "ID", "treat", "applied", "accept",
              "app.sex", "app.race")

## replace'2' as an acceptance outcome -- the original dataset had three levels
## but only used two (0 indicates not accepted for any participation,
##                    2 indicates accepted for poster presentation)
x$accept[x$accept==2] <- 1

## replace 9 as an outcome for accept --
## I used 9 to indicate someonw who didn't apply and couldn't
## be accepted when hand merging the datasets
x$accept[x$accept==9] <- 0


## attach the school rank data
univlist <- read.csv("university.list.csv",
                     stringsAsFactors=FALSE)
## clean up a bad entry
univlist <- univlist[1:53,]
## convert rank to numeric
univlist$u.rank <- as.numeric(univlist$u.rank)

x$Dept.Rank <- NULL
for (u in unique(x$univ)){
  x$Dept.Rank[x$univ == u] <- univlist$u.rank[univlist$univ == u]
}










###################################################################
## stratified analysis

output.table <- data.frame(subgroup=c("full sample",
                             "top 10",
                             "top 11 to 25",
                             "top 26 to 50"),
                           male.SATE=NA, female.SATE=NA,
                           estimated.difference=NA,
                           lower95=NA, upper95=NA, probGT0=NA,
                           stringsAsFactors=FALSE)


## full sample difference
tab.data <- tabulate.data(x, outcome="applied")
bayes.out <- bayes.sex.diff(tab.data)
output.table$subgroup[1] <- "full sample"
output.table$male.SATE[1] <- bayes.out["SATE.male.mean"]
output.table$female.SATE[1] <- bayes.out["SATE.female.mean"]
output.table$estimated.difference[1] <- bayes.out["SATE.dif.mean"]
output.table$lower95[1] <- bayes.out["SATE.dif.lower"]
output.table$upper95[1] <- bayes.out["SATE.dif.upper"]
output.table$probGT0[1] <- bayes.out["Prob SATE > 0"]







## top-10 programs difference
top10 <- univlist$univ[univlist$u.rank <= 10]
tab.data <- tabulate.data(x, outcome="applied")
bayes.out <- bayes.sex.diff(tab.data, subset=tab.data$univ %in% top10)
output.table$subgroup[2] <- "Top 10"
output.table$male.SATE[2] <- bayes.out["SATE.male.mean"]
output.table$female.SATE[2] <- bayes.out["SATE.female.mean"]
output.table$estimated.difference[2] <- bayes.out["SATE.dif.mean"]
output.table$lower95[2] <- bayes.out["SATE.dif.lower"]
output.table$upper95[2] <- bayes.out["SATE.dif.upper"]
output.table$probGT0[2] <- bayes.out["Prob SATE > 0"]







## 11-25 programs difference
top11.25 <- univlist$univ[univlist$u.rank <= 25 & univlist$u.rank >=11]
tab.data <- tabulate.data(x, outcome="applied")
bayes.out <- bayes.sex.diff(tab.data, subset=tab.data$univ %in% top11.25)
output.table$subgroup[3] <- "Top 11 to 25"
output.table$male.SATE[3] <- bayes.out["SATE.male.mean"]
output.table$female.SATE[3] <- bayes.out["SATE.female.mean"]
output.table$estimated.difference[3] <- bayes.out["SATE.dif.mean"]
output.table$lower95[3] <- bayes.out["SATE.dif.lower"]
output.table$upper95[3] <- bayes.out["SATE.dif.upper"]
output.table$probGT0[3] <- bayes.out["Prob SATE > 0"]





## 26-50 program difference
top26.50 <- univlist$univ[univlist$u.rank <= 50 & univlist$u.rank >=26]
tab.data <- tabulate.data(x, outcome="applied")
bayes.out <- bayes.sex.diff(tab.data, subset=tab.data$univ %in% top26.50)
output.table$subgroup[4] <- "Top 26 to 50"
output.table$male.SATE[4] <- bayes.out["SATE.male.mean"]
output.table$female.SATE[4] <- bayes.out["SATE.female.mean"]
output.table$estimated.difference[4] <- bayes.out["SATE.dif.mean"]
output.table$lower95[4] <- bayes.out["SATE.dif.lower"]
output.table$upper95[4] <- bayes.out["SATE.dif.upper"]
output.table$probGT0[4] <- bayes.out["Prob SATE > 0"]




save(output.table, file="ATESexDifferenceSummary.Rda")






library(xtable)

print(xtable(output.table, display=c("s", "s", "f", "f", "f", "f", "f", "f"),
             digits=c(10, 10, 3, 3, 3, 3, 3, 3)),
      include.rownames=FALSE)





